####################################################################################
## “Do Provinces that Win the Spanish Christmas Lottery Experience a Surge in Incumbent
## Popularity? An Out-of-Sample Replication of Bagues and Esteve-Volart (2016)”,
## by Carolina Bernal, Donald P. Green, and David Vilalta
## What for: Creates Figure 3 in the paper
####################################################################################
library(fixest)
library(ggplot2)
library(dplyr)
library(car)
library(haven)
# Load the dataset
file_path <- ".../Data/Our_data/20240723_SpanishLottery_Complete_province.dta"
data <- read_dta(file_path)
####################################################################################
## “Do Provinces that Win the Spanish Christmas Lottery Experience a Surge in Incumbent
## Popularity? An Out-of-Sample Replication of Bagues and Esteve-Volart (2016)”,
## by Carolina Bernal, Donald P. Green, and David Vilalta
## What for: Creates Figure 3 in the paper
####################################################################################
library(fixest)
library(ggplot2)
library(dplyr)
library(car)
library(haven)
# Load the dataset
file_path <- "/Users/carolinabernal/Library/CloudStorage/Dropbox/Spanish_Lottery/StataDos_RScripts/202406 Analysis/Ours_replication_pkg/NEW AND FINAL REPPKG/Data/Our_data/20250531_SpanishLottery_Complete_province.dta"
data <- read_dta(file_path)
View(data)
complete_data <- data %>%
mutate(D_housing_price_2 = if_else(is.na(D_housing_price_2), housing_price_growth_term_1, D_housing_price_2)) %>%
filter(year < 2009)
# Ensure the dataset does not contain any NAs in the variables used in the models
complete_data <- complete_data %>%
select(province, year, var_votes_inc_ours, top_prizes_gdp_term_2, expenditure_gdp_term_2,
D_unemployment_rate_2, D_gdp_pc_2, D_housing_price_2, D_cpi_2, province_num, population) %>%
na.omit()
# Fit the full model
full_model <- lm(var_votes_inc_ours ~ top_prizes_gdp_term_2 +
expenditure_gdp_term_2 * factor(year) +
D_unemployment_rate_2 +
D_gdp_pc_2 +
D_housing_price_2 +
D_cpi_2 +
factor(year) +
factor(province_num),
data = complete_data,
weights = population)
# Generate the added variable plot and capture the plot data
avPlot <- avPlots(full_model, terms = "top_prizes_gdp_term_2", id.n = 0, pch = 19, cex = 0.5, col = "blue", plot = FALSE)
avPlots(full_model, terms = "top_prizes_gdp_term_2", id.n = 4, pch = 19, cex = 0.5, col = "blue")
dev.off()
# Extract the plot data
plot_data <- as.data.frame(avPlot[[1]])
plot_data$population <- complete_data$population
plot_data$row_number <- rownames(plot_data)  # Add row numbers
# Create the plot using ggplot2
p <- ggplot(plot_data, aes(x = top_prizes_gdp_term_2, y = var_votes_inc_ours, size = population)) +
geom_point(alpha = 0.5, color = "brown") +
geom_smooth(method = "lm", se = TRUE, fill = "blue", alpha = 0.2) +  # Adjust transparency
labs(x = "Prizes as % of GDP | X",
y = "Delta Incumbent Votes | X",
title = "",
size = "Population") +
theme_minimal() +
theme(
legend.position = "none")
p
####################################################################################
## “Do Provinces that Win the Spanish Christmas Lottery Experience a Surge in Incumbent
## Popularity? An Out-of-Sample Replication of Bagues and Esteve-Volart (2016)”,
## by Carolina Bernal, Donald P. Green, and David Vilalta
## What for: Creates Figure 3 in the paper
####################################################################################
library(fixest)
library(ggplot2)
library(dplyr)
library(car)
library(haven)
# Load the dataset
file_path <- "/Users/carolinabernal/Library/CloudStorage/Dropbox/Spanish_Lottery/Data/Our_data/Data_electoral_Graph_3_NOE.dta"
data <- read_dta(file_path)
################################################################################
# Replication period
################################################################################
complete_data <- data %>%
mutate(D_housing_price_2 = if_else(is.na(D_housing_price_2), housing_price_growth_term_1, D_housing_price_2)) %>%
filter(year < 2009)
# Ensure the dataset does not contain any NAs in the variables used in the models
complete_data <- complete_data %>%
select(province, year, var_votes_inc_ours, top_prizes_gdp_term_2, expenditure_gdp_term_2,
D_unemployment_rate_2, D_gdp_pc_2, D_housing_price_2, D_cpi_2, province_num, population) %>%
na.omit()
# Fit the full model
full_model <- lm(var_votes_inc_ours ~ top_prizes_gdp_term_2 +
expenditure_gdp_term_2 * factor(year) +
D_unemployment_rate_2 +
D_gdp_pc_2 +
D_housing_price_2 +
D_cpi_2 +
factor(year) +
factor(province_num),
data = complete_data,
weights = population)
# Generate the added variable plot and capture the plot data
avPlot <- avPlots(full_model, terms = "top_prizes_gdp_term_2", id.n = 0, pch = 19, cex = 0.5, col = "blue", plot = FALSE)
avPlots(full_model, terms = "top_prizes_gdp_term_2", id.n = 4, pch = 19, cex = 0.5, col = "blue")
dev.off()
# Extract the plot data
plot_data <- as.data.frame(avPlot[[1]])
plot_data$population <- complete_data$population
plot_data$row_number <- rownames(plot_data)  # Add row numbers
# Create the plot using ggplot2
p <- ggplot(plot_data, aes(x = top_prizes_gdp_term_2, y = var_votes_inc_ours, size = population)) +
geom_point(alpha = 0.5, color = "brown") +
geom_smooth(method = "lm", se = TRUE, fill = "blue", alpha = 0.2) +  # Adjust transparency
labs(x = "Prizes as % of GDP | X",
y = "Delta Incumbent Votes | X",
title = "",
size = "Population") +
theme_minimal() +
theme(
legend.position = "none")
p
complete_data <- data %>%
mutate(D_housing_price_2 = if_else(is.na(D_housing_price_2), housing_price_growth_term_1, D_housing_price_2)) %>%
filter(year > 2008)
# Ensure the dataset does not contain any NAs in the variables used in the models
complete_data <- complete_data %>%
select(province, year, var_votes_inc_ours, top_prizes_gdp_term_2, expenditure_gdp_term_2,
D_unemployment_rate_2, D_gdp_pc_2, D_housing_price_2, D_cpi_2, province_num, population) %>%
na.omit()
# Fit the full model
full_model <- lm(var_votes_inc_ours ~ top_prizes_gdp_term_2 +
expenditure_gdp_term_2 * factor(year) +
D_unemployment_rate_2 +
D_gdp_pc_2 +
D_housing_price_2 +
D_cpi_2 +
factor(year) +
factor(province_num),
data = complete_data,
weights = population)
# Generate the added variable plot and capture the plot data
avPlot <- avPlots(full_model, terms = "top_prizes_gdp_term_2", id.n = 0, pch = 19, cex = 0.5, col = "blue", plot = FALSE)
avPlots(full_model, terms = "top_prizes_gdp_term_2", id.n = 4, pch = 19, cex = 0.5, col = "blue")
dev.off()
# Extract the plot data
plot_data <- as.data.frame(avPlot[[1]])
plot_data$population <- complete_data$population
plot_data$row_number <- rownames(plot_data)  # Add row numbers
# Create the plot using ggplot2
p <- ggplot(plot_data, aes(x = top_prizes_gdp_term_2, y = var_votes_inc_ours, size = population)) +
geom_point(alpha = 0.5, color = "brown") +
geom_smooth(method = "lm", se = TRUE, fill = "blue", alpha = 0.2) +  # Adjust transparency
labs(x = "Prizes as % of GDP | X",
y = "Delta Incumbent Votes | X",
title = "",
size = "Population") +
theme_minimal() +
theme(
legend.position = "none")
p
####################################################################################
## “Do Provinces that Win the Spanish Christmas Lottery Experience a Surge in Incumbent
## Popularity? An Out-of-Sample Replication of Bagues and Esteve-Volart (2016)”,
## by Carolina Bernal, Donald P. Green, and David Vilalta
## What for: Creates Figure 3 in the paper
####################################################################################
library(fixest)
library(ggplot2)
library(dplyr)
library(car)
library(haven)
# =========================
# Paths (edit only BASE_DIR)
# =========================
BASE_DIR <- "C:/Users/david/Dropbox/Spanish_Lottery/StataDos_RScripts/202406 Analysis/Ours_replication_pkg/NEW AND FINAL REPPKG"
DATA_FILE <- file.path(
BASE_DIR, "Data", "Our_data",
"20240723_SpanishLottery_Complete_province.dta"
)
FIG_DIR <- file.path(BASE_DIR, "Results", "Figures_Electoral")
dir.create(FIG_DIR, recursive = TRUE, showWarnings = FALSE)
# Load the dataset
data <- read_dta(DATA_FILE)
dir.exists(BASE_DIR)
dir.exists(file.path(BASE_DIR, "Data", "Our_data"))
file.exists(DATA_FILE)
####################################################################################
## “Do Provinces that Win the Spanish Christmas Lottery Experience a Surge in Incumbent
## Popularity? An Out-of-Sample Replication of Bagues and Esteve-Volart (2016)”,
## by Carolina Bernal, Donald P. Green, and David Vilalta
## What for: Creates Figure 3 in the paper
####################################################################################
library(fixest)
library(ggplot2)
library(dplyr)
library(car)
library(haven)
# =========================
# Paths (edit only BASE_DIR)
# =========================
BASE_DIR <- "C:/Users/david/Dropbox/Spanish_Lottery/StataDos_RScripts/202406 Analysis/Ours_replication_pkg/NEW AND FINAL REPPKG"
DATA_FILE <- file.path(
BASE_DIR, "Data", "Our_data",
"20250531_SpanishLottery_Complete_province.dta"
)
FIG_DIR <- file.path(BASE_DIR, "Results", "Figures_Electoral")
dir.create(FIG_DIR, recursive = TRUE, showWarnings = FALSE)
# Load the dataset
data <- read_dta(DATA_FILE)
# Helper: quietly close a plotting device if one is open
close_device_if_open <- function() {
if (dev.cur() > 1) try(dev.off(), silent = TRUE)
}
################################################################################
# Replication period  (Figure 3a)
################################################################################
complete_data <- data %>%
mutate(D_housing_price_2 = if_else(is.na(D_housing_price_2), housing_price_growth_term_1, D_housing_price_2)) %>%
filter(year < 2009) %>%
select(province, year, var_votes_inc_ours, top_prizes_gdp_term_2, expenditure_gdp_term_2,
D_unemployment_rate_2, D_gdp_pc_2, D_housing_price_2, D_cpi_2, province_num, population) %>%
na.omit()
full_model <- lm(
var_votes_inc_ours ~ top_prizes_gdp_term_2 +
expenditure_gdp_term_2 * factor(year) +
D_unemployment_rate_2 +
D_gdp_pc_2 +
D_housing_price_2 +
D_cpi_2 +
factor(year) +
factor(province_num),
data = complete_data,
weights = population
)
avPlot <- avPlots(full_model, terms = "top_prizes_gdp_term_2", id.n = 0, pch = 19, cex = 0.5, col = "blue", plot = FALSE)
avPlots(full_model, terms = "top_prizes_gdp_term_2", id.n = 4, pch = 19, cex = 0.5, col = "blue")
close_device_if_open()
plot_data <- as.data.frame(avPlot[[1]])
plot_data$population <- complete_data$population
plot_data$row_number <- rownames(plot_data)
p_rep <- ggplot(plot_data, aes(x = top_prizes_gdp_term_2, y = var_votes_inc_ours, size = population)) +
geom_point(alpha = 0.5, color = "brown") +
geom_smooth(method = "lm", se = TRUE, fill = "blue", alpha = 0.2) +
labs(x = "Prizes as % of GDP | X",
y = "Delta Incumbent Votes | X",
title = "",
size = "Population") +
theme_minimal() +
theme(legend.position = "none")
ggsave(filename = file.path(FIG_DIR, "figure3a.png"), plot = p_rep, width = 6.5, height = 4.5, dpi = 300)
################################################################################
# Out-of-sample period  (Figure 3b)
################################################################################
complete_data <- data %>%
mutate(D_housing_price_2 = if_else(is.na(D_housing_price_2), housing_price_growth_term_1, D_housing_price_2)) %>%
filter(year > 2008) %>%
select(province, year, var_votes_inc_ours, top_prizes_gdp_term_2, expenditure_gdp_term_2,
D_unemployment_rate_2, D_gdp_pc_2, D_housing_price_2, D_cpi_2, province_num, population) %>%
na.omit()
full_model <- lm(
var_votes_inc_ours ~ top_prizes_gdp_term_2 +
expenditure_gdp_term_2 * factor(year) +
D_unemployment_rate_2 +
D_gdp_pc_2 +
D_housing_price_2 +
D_cpi_2 +
factor(year) +
factor(province_num),
data = complete_data,
weights = population
)
avPlot <- avPlots(full_model, terms = "top_prizes_gdp_term_2", id.n = 0, pch = 19, cex = 0.5, col = "blue", plot = FALSE)
avPlots(full_model, terms = "top_prizes_gdp_term_2", id.n = 4, pch = 19, cex = 0.5, col = "blue")
close_device_if_open()
plot_data <- as.data.frame(avPlot[[1]])
plot_data$population <- complete_data$population
plot_data$row_number <- rownames(plot_data)
p_oos <- ggplot(plot_data, aes(x = top_prizes_gdp_term_2, y = var_votes_inc_ours, size = population)) +
geom_point(alpha = 0.5, color = "brown") +
geom_smooth(method = "lm", se = TRUE, fill = "blue", alpha = 0.2) +
labs(x = "Prizes as % of GDP | X",
y = "Delta Incumbent Votes | X",
title = "",
size = "Population") +
theme_minimal() +
theme(legend.position = "none")
ggsave(filename = file.path(FIG_DIR, "figure3b.png"), plot = p_oos, width = 6.5, height = 4.5, dpi = 300)
################################################################################
# Pooled  (Figure 3c)
################################################################################
complete_data <- data %>%
mutate(D_housing_price_2 = if_else(is.na(D_housing_price_2), housing_price_growth_term_1, D_housing_price_2)) %>%
select(province, year, var_votes_inc_ours, top_prizes_gdp_term_2, expenditure_gdp_term_2,
D_unemployment_rate_2, D_gdp_pc_2, D_housing_price_2, D_cpi_2, province_num, population) %>%
na.omit()
full_model <- lm(
var_votes_inc_ours ~ top_prizes_gdp_term_2 +
expenditure_gdp_term_2 * factor(year) +
D_unemployment_rate_2 +
D_gdp_pc_2 +
D_housing_price_2 +
D_cpi_2 +
factor(year) +
factor(province_num),
data = complete_data,
weights = population
)
avPlot <- avPlots(full_model, terms = "top_prizes_gdp_term_2", id.n = 0, pch = 19, cex = 0.5, col = "blue", plot = FALSE)
avPlots(full_model, terms = "top_prizes_gdp_term_2", id.n = 4, pch = 19, cex = 0.5, col = "blue")
close_device_if_open()
plot_data <- as.data.frame(avPlot[[1]])
plot_data$population <- complete_data$population
plot_data$row_number <- rownames(plot_data)
p_pool <- ggplot(plot_data, aes(x = top_prizes_gdp_term_2, y = var_votes_inc_ours, size = population)) +
geom_point(alpha = 0.5, color = "brown") +
geom_smooth(method = "lm", se = TRUE, fill = "blue", alpha = 0.2) +
labs(x = "Prizes as % of GDP | X",
y = "Delta Incumbent Votes | X",
title = "",
size = "Population") +
theme_minimal() +
theme(legend.position = "none")
ggsave(filename = file.path(FIG_DIR, "figure3c.png"), plot = p_pool, width = 6.5, height = 4.5, dpi = 300)
rm(list=ls())
library(dplyr)
library(ggplot2)
library(tidyr)
library(lfe)
library(fixest)
#setwd("/Users/carolinabernal/Library/CloudStorage/Dropbox/Spanish_Lottery/StataDos_RScripts/202406 Analysis/Ours_replication_pkg/NEW AND FINAL REPPKG")
data <- read.csv("Data/Our_data/20240814_SurveyData_for_simulations.csv", header=TRUE)
setwd("/Users/carolinabernal/Library/CloudStorage/Dropbox/Spanish_Lottery/StataDos_RScripts/202406 Analysis/Ours_replication_pkg/NEW AND FINAL REPPKG")
install.packages("rlogging")
install.packages("logr")
Sys.time()
library(logr)
log_open(file_name = "Appendix_SurveyEvidence_Figures_log")
## Popularity? An Out-of-Sample Replication of Bagues and Esteve-Volart (2016)”,
## by Carolina Bernal, Donald P. Green, and David Vilalta
## Script author: Carolina Bernal
## What for: Run simulations to check whether inputting zeroes to
## prizes and expenditure in the last 3 quarters of the year biases estimation
####################################################################################
rm(list=ls())
library(dplyr)
library(ggplot2)
library(tidyr)
library(lfe)
library(fixest)
data <- read.csv("Data/Our_data/20240814_SurveyData_for_simulations.csv", header=TRUE)
Sys.time()
data$imputed <- ifelse(data$month <= 3, 0 , 1)
library(dplyr)
# Define the number of simulations
n_sims <- 10000 # Be aware that running the 10K sims take several hours.
# 1) Discretize (Bin) the Continuous `expenditure_all` Variable
# Discretize expenditure_all into bins (e.g., using quantiles)
data <- data %>%
mutate(expenditure_bin = cut(expenditure_all,
breaks = quantile(expenditure_all, probs = seq(0, 1, by = 0.2), na.rm = TRUE),
include.lowest = TRUE,
labels = FALSE))
# 2) Function to conduct randomization inference with a clustered, continuous treatment and conditional assignment
run_randomization_inference_clustered <- function(data) {
# Function to calculate the ATE for a given shuffled treatment
calc_ate <- function(data) {
# Fit a linear model to estimate the treatment effect controlling for expenditure_all
model <-     feols(
vote_incumbent ~ top_prizes_gdp + expenditure_all + as.factor(municipality_size) + as.factor(female) + as.factor(education) + as.factor(status) + age | fixed_effect + survey,
cluster = c("province"),
data = data)
ate <- coef(model)["top_prizes_gdp"]
return(ate)
}
# Run the randomization inference
sampling_distribution <- replicate(n_sims, {
# Shuffle treatment at the province level within expenditure bins
shuffled_data <- data %>%
group_by(expenditure_bin, province) %>%
mutate(top_prizes_gdp = sample(top_prizes_gdp, size = n(), replace = FALSE)) %>%
ungroup()
calc_ate(shuffled_data)
})
return(sampling_distribution)
}
set.seed(123456)
# 3) Randomization inference with all data (including imputed zeroes)
sampling_distribution_with_imputed <- run_randomization_inference_clustered(data)
# 4) Randomization inference without imputed zeroes
data_no_imputed <- data %>% filter(imputed == 0)
sampling_distribution_without_imputed <- run_randomization_inference_clustered(data_no_imputed)
# 5) Analyze the sampling distributions
mean_with_imputed <- mean(sampling_distribution_with_imputed)
mean_without_imputed <- mean(sampling_distribution_without_imputed)
cat("Mean of sampling distribution with imputed zeroes:", mean_with_imputed, "\n")
cat("Mean of sampling distribution without imputed zeroes:", mean_without_imputed, "\n")
# Check the number of unique values in the sampling distributions
cat("Number of unique values in sampling_distribution_with_imputed:", length(unique(sampling_distribution_with_imputed)), "\n")
cat("Number of unique values in sampling_distribution_without_imputed:", length(unique(sampling_distribution_without_imputed)), "\n")
# 6) Plot the sampling distributions separately using ggplot2 colors
# Convert the sampling distributions into data frames for ggplot2
hist_data_with_imputed <- data.frame(ATE = sampling_distribution_with_imputed)
hist_data_without_imputed <- data.frame(ATE = sampling_distribution_without_imputed)
### PLOTS:
# 7) Histogram for 'sampling_distribution_with_imputed'
ggplot(hist_data_with_imputed, aes(x = ATE)) +
geom_histogram(fill = "blue", bins = 20, alpha = 0.5) +
geom_vline(xintercept = 0, color = "red", linetype = "dashed", size = 1) +
labs(title = "Sampling Distribution with Imputed Zeroes",
x = "ATE",
y = "Frequency") +
theme_minimal()
# 8) Histogram for 'sampling_distribution_without_imputed'
ggplot(hist_data_without_imputed, aes(x = ATE)) +
geom_histogram(fill = "green", bins = 20, alpha = 0.5) +
geom_vline(xintercept = 0, color = "red", linetype = "dashed", size = 1) +
labs(title = "Sampling Distribution without Imputed Zeroes",
x = "ATE",
y = "Frequency") +
theme_minimal()
# 9) Combine the two histograms in a single plot
# Create a data frame for combined histograms
hist_data <- data.frame(
ATE = c(sampling_distribution_with_imputed, sampling_distribution_without_imputed),
Group = c(rep("With Imputed Zeroes", length(sampling_distribution_with_imputed)),
rep("Without Imputed Zeroes", length(sampling_distribution_without_imputed)))
)
# Plot combined histograms
ggplot(hist_data, aes(x = ATE, fill = Group)) +
geom_histogram(position = "identity", alpha = 0.5, bins = 20) +
scale_fill_manual(values = c("blue", "green")) +
geom_vline(xintercept = 0, color = "red", linetype = "dashed", size = 1) +
labs(title = "Comparison of Sampling Distributions",
x = "ATE",
y = "Frequency",
fill = "Data") +
theme_minimal() +
theme(legend.position = "bottom")  # Move legend below
# Check the size of the filtered dataset
cat("Number of observations in data_no_imputed:", nrow(data_no_imputed), "\n")
# Check the variability of the treatment in the filtered dataset
summary(data_no_imputed$top_prizes_gdp)
table(data_no_imputed$top_prizes_gdp)
# Check for variability within the expenditure bins and provinces
table(data_no_imputed$expenditure_bin, data_no_imputed$province)
# Calculate standard error
se_with_imputed <- sd(sampling_distribution_with_imputed) / sqrt(length(sampling_distribution_with_imputed))
se_without_imputed <- sd(sampling_distribution_without_imputed) / sqrt(length(sampling_distribution_without_imputed))
# Calculate 95% confidence intervals
ci_with_imputed <- mean(sampling_distribution_with_imputed) + c(-1.96, 1.96) * se_with_imputed
ci_without_imputed <- mean(sampling_distribution_without_imputed) + c(-1.96, 1.96) * se_without_imputed
# Output the results to build Table 13 in the Appendix:
cat("Mean of sampling distribution with imputed zeroes:", mean_with_imputed, "\n")
cat("Mean of sampling distribution without imputed zeroes:", mean_without_imputed, "\n")
cat("Standard Error with imputed zeroes:", se_with_imputed, "\n")
cat("95% Confidence Interval with imputed zeroes:", ci_with_imputed, "\n")
cat("Does CI with imputed zeroes exclude zero?", !between(0, ci_with_imputed[1], ci_with_imputed[2]), "\n\n")
cat("Standard Error without imputed zeroes:", se_without_imputed, "\n")
cat("95% Confidence Interval without imputed zeroes:", ci_without_imputed, "\n")
cat("Does CI without imputed zeroes exclude zero?", !between(0, ci_without_imputed[1], ci_without_imputed[2]), "\n")
save.image(file='Data/Simulation_AllCtrols_10K.RData')
Sys.time()
# load('Data/Simulation_AllCtrols_10K.RData')  ## load this if you want to check the simulation results but have no time to run it yourself
# Close log
log_close()
savehistory()
